library(tidyverse) # Colección de paquetes para ciencia de datos (incluye ggplot2, dplyr, tidyr, readr, purrr, tibble)
library(reshape) # Herramientas para reorganizar datos
library(Hmisc) # resúmenes estadísticos
library(limma) # Análisis de datos de expresión genética
library(AnnotationDbi) # Interfaz para bases de datos de anotaciones bioinformáticas
library(org.Hs.eg.db) # Datos de anotación para genes humanos
library(VennDiagram) # Generación de diagramas de Venn 
library(gridExtra) # Mostrar varias gráficas
library(patchwork) # Combinar múltiples ggplots en un único plot
library(ggrepel) # Mejora la visualización de texto en ggplots evitando solapamientos de texto
library(Rtsne) # Implementación de t-SNE
library(umap) # Implementación de UMAP
library(ggVennDiagram) # diagrama de Venn
library(fastDummies)  # Para realizar el one-hot encoding
library(reactable) # para tablas interactivas
library(factoextra)
library(ggstatsplot) # para barras de porcentajes de las tablas de contingencia
library(UpSetR) # para diagramas de upset
library(gprofiler2) # enriquecimiento grofiler
library(clusterProfiler) # para enriquecimientos
library(enrichplot) # plots de enriquecimiento
library(DOSE) # needed to convert to enrichResult object

1 Leyendo datos

Descargar los datos. Escalo los datos de la matriz para tener homogeneidad en las representaciones.

# covariables
ROSMAP_RINPMIAGESEX_covs <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_covs.rds")
covs <- ROSMAP_RINPMIAGESEX_covs
rownames(covs) <- covs$mrna_id
covs$study <- as.factor(covs$study)
covs$projid <- as.character(covs$projid)
covs$ceradsc <- as.factor(covs$ceradsc)
covs$cogdx <- as.factor(covs$cogdx)
covs$neuroStatus <- as.factor(covs$neuroStatus)

#datos corregidos 
ROSMAP_RINPMIAGESEX_resids <- readRDS("~/Library/CloudStorage/OneDrive-UNIVERSIDADDEMURCIA/Documentos/Fernando/Master Bioinformatica/TFM/datos/ROSMAP_RINPMIAGESEX_resids.rds")
mat_exp <- scale(ROSMAP_RINPMIAGESEX_resids)

One-hot encoding de la matriz de covariables

covs2 <- data.frame(matrix(ncol = 0, nrow = nrow(covs))) # Hago un dataframe vacio para meter los datos procesados

for (colname in names(covs)) {
  if (is.factor(covs[[colname]]) & length(levels(covs[[colname]])) > 2) {

    dummy_df <- dummy_cols(covs[colname],
                           remove_selected_columns = TRUE) # quitar las variables iniciales
    
    dummy_df <- data.frame(lapply(dummy_df, factor))

    covs2 <- cbind(covs2, dummy_df)

  } else {

    covs2[[colname]] <- covs[[colname]] # si son numéricas o de otra clase, las añadimos igual
  }
}
rownames(covs2) <- covs2$mrna_id

2 Cargando geneset

# geneset de Alzheimer extraidos de KEGG

tab <- getGeneKEGGLinks(species="hsa")
tab$Symbol <- mapIds(org.Hs.eg.db, tab$GeneID,
                       column="SYMBOL", keytype="ENTREZID")

paths <- getKEGGPathwayNames(species="hsa")
geneset_alz <- tab$Symbol[tab$PathwayID=="hsa05010"]

3 Mecanismos de filtrado

Filtrar la matriz Mnxm a Mnxm’, con m’ << n para filtrar los predictores, en este caso de la matriz de expresión mat_exp.

mat_exp_alz_genes <- mat_exp[, colnames(mat_exp) %in% geneset_alz]
venn.plot <- venn.diagram(
  x = list(GenesMatriz = colnames(mat_exp), GenesetAlzheimer = geneset_alz),
  category.names = c("Matrix Genes", "Geneset Alzheimer"),
  filename = NULL,
  output = FALSE,  # Asegura que no se exporte a un archivo
  fill = c("#440154ff", '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = 25, #posicion de las categoricas
  cat.dist = 0.1, #distancia de las categoricas
  rotation.degree = 0, 
  margin = 0.1, # hacerla más pequeña
  lwd=0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = "Alzheimer genes in exp matrix",
  main.fontface= "bold", 
  main.cex = 2,
  main.pos = c(0.5, 1)
)

grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)

4 Mecanismo de detección de outliers

4.1 PCA

pca_KEGG <- prcomp(mat_exp_alz_genes)
# Scree plot con los datos escalados
var_exp <- pca_KEGG$sdev^2
prop_var_exp <- var_exp / sum(var_exp)
cum_var_exp <- cumsum(prop_var_exp)

df_var_exp <- data.frame(Comp = 1:length(prop_var_exp), VarExp = prop_var_exp)
df_cum_var_exp <- data.frame(Comp = 1:length(cum_var_exp), CumVarExp = cum_var_exp)

ggplot(df_var_exp[1:20,], aes(x = Comp, y = VarExp)) +
  geom_bar(stat = "identity", fill = "skyblue") +
  geom_line(aes(group = 1), color = "blue") +
  geom_point(color = "blue") +
  theme_minimal() +
  labs(x = "Principal components", y = "Variance", title = "Scree Plot scaled") +
  ylim(c(0,1)) +
  geom_line(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color="#8B1A1A") +
  geom_point(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), color = "red") +
  geom_bar(data = df_cum_var_exp[1:20,], aes(x = Comp, y = CumVarExp), stat = "identity", fill = "red", alpha= 0.25) +
  annotate("text", x = 4, y = 0.85, label = "Cumulative Scree Plot", color = "#8B1A1A", size = 4) +
  geom_text(data = df_cum_var_exp[seq(0,20,2),], aes(x=Comp, y = CumVarExp +0.04, label = round(CumVarExp, 2)))

4.1.1 Seleccionar muestras extremas para las dos primeras PC

Selecciono los outliers de las dos primeras PC con el doble de SD.

outlierspc1 <- as.data.frame(pca_KEGG$x[abs(pca_KEGG$x[,1]) > 1.5*(pca_KEGG$sdev[1]),1])

outlierspc2 <- as.data.frame(pca_KEGG$x[abs(pca_KEGG$x[,2]) > 1.5*(pca_KEGG$sdev[2]),2])
df <- as.data.frame(pca_KEGG$x)

plot1 <- ggplot(df, aes(x = PC1)) +
  geom_density(fill = "#00CED1", alpha = 0.5) +
  geom_vline(xintercept = mean(df$PC1) + 1.5*pca_KEGG$sdev[1], linetype = "dashed", color = "blue") +
  geom_vline(xintercept = mean(df$PC1) - 1.5*pca_KEGG$sdev[1], linetype = "dashed", color = "blue") +
  labs(title = "PC1 density with 1.5*SD scaled PCA") +
  geom_text(data = outlierspc1, 
            aes(x = outlierspc1[,1], y= 0, label = rownames(outlierspc1)), 
            vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
  geom_rug(data = as.data.frame(pca_KEGG$x), aes(x= PC1, y = 0), color= ifelse(abs(pca_KEGG$x[,1]) > 1.5*(pca_KEGG$sdev[1]), "#8B1A1A", "grey")) +
  theme_minimal()

plot2 <- ggplot(df, aes(x = PC2)) +
  geom_density(fill = "#00CED1", alpha = 0.5) +
  geom_vline(xintercept = mean(df$PC2) + 1.5*pca_KEGG$sdev[2], linetype = "dashed", color = "blue") +
  geom_vline(xintercept = mean(df$PC2) - 1.5*pca_KEGG$sdev[2], linetype = "dashed", color = "blue") +
  labs(title = "PC2 density with 1.5*SD scaled PCA") +
  geom_text(data = outlierspc2, 
            aes(x = outlierspc2[,1], y= 0, label = rownames(outlierspc2)), 
            vjust = 1.5, hjust = 0, size = 3, color = "#E9965A", angle = 90, fontface= "bold") +
  geom_rug(data = as.data.frame(pca_KEGG$x), aes(x= PC2, y = 0), color= ifelse(abs(pca_KEGG$x[,2]) > 1.5*(pca_KEGG$sdev[2]), "#8B1A1A", "grey")) +
  theme_minimal()

grid.arrange(plot1, plot2, ncol = 1)

mPC1.pos <- rownames(outlierspc1[outlierspc1[, 1] > 0 , , drop = F])
mPC1.neg <- rownames(outlierspc1[outlierspc1[, 1] < 0 , , drop = F ])

mPC2.pos <- rownames(outlierspc2[outlierspc2[, 1] > 0 , , drop = F ])
mPC2.neg <- rownames(outlierspc2[outlierspc2[, 1] < 0 , , drop = F])

Tabla outliers PCA

# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(mPC1.pos),
                  length(mPC1.neg),
                  length(mPC2.pos), 
                  length(mPC2.neg))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mPC1.pos <- c(mPC1.pos, rep("", max_length - length(mPC1.pos)))
rownames_mPC1.neg <- c(mPC1.neg, rep("", max_length - length(mPC1.neg)))
rownames_mPC2.pos <- c(mPC2.pos, rep("", max_length - length(mPC2.pos)))
rownames_mPC2.neg <- c(mPC2.neg, rep("", max_length - length(mPC2.neg)))


final_table <- data.frame(
  mPC1.pos = rownames_mPC1.pos,
  mPC1.neg = rownames_mPC1.neg,
  mPC2.pos = rownames_mPC2.pos,
  mPC2.neg = rownames_mPC2.neg
)

final_table

4.2 tSNE

set.seed(1234)
tsne <- Rtsne(mat_exp_alz_genes, dims = 2, theta = 0.0)

tsne.data <- as.data.frame(tsne$Y)
row.names(tsne.data) <- row.names(mat_exp_alz_genes)
tsne.data.covs <- merge(tsne.data, covs, by = "row.names")
tsne.data.covs$Row.names <- NULL
row.names(tsne.data.covs) <- tsne.data.covs$mrna_id

4.2.0.1 Seleccionamos outliers

rownames(tsne.data) <- rownames(mat_exp_alz_genes)

# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.tsne1 <- mean(tsne.data[,1])
sd.tsne1 <- sd(tsne.data[,1])
mean.tsne2 <- mean(tsne.data[,2])
sd.tsne2 <- sd(tsne.data[,2])

# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.tsne1 <- tsne.data[abs(tsne.data[,1] - mean.tsne1) > 1.5 * sd.tsne1, ]
outliers.tsne2 <-tsne.data[abs(tsne.data[,2] - mean.tsne2) > 1.5 * sd.tsne2, ]

mtSNE1.pos <- rownames(outliers.tsne1[outliers.tsne1[, 1] > 0, , drop = F])

mtSNE1.neg <- rownames(outliers.tsne1[outliers.tsne1[, 1] < 0, , drop = F])

mtSNE2.pos <- rownames(outliers.tsne2[outliers.tsne2[, 1] > 0, , drop = F])

mtSNE2.neg <- rownames(outliers.tsne2[outliers.tsne2[, 1] < 0, , drop = F])

Tabla outliers tSNE

# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(mtSNE1.pos),
                  length(mtSNE1.neg),
                  length(mtSNE2.pos), 
                  length(mtSNE2.neg))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mtSNE1.pos <- c(mtSNE1.pos, rep("", max_length - length(mtSNE1.pos)))
rownames_mtSNE1.neg <- c(mtSNE1.neg, rep("", max_length - length(mtSNE1.neg)))
rownames_mtSNE2.pos <- c(mtSNE2.pos, rep("", max_length - length(mtSNE2.pos)))
rownames_mtSNE2.neg <- c(mtSNE2.neg, rep("", max_length - length(mtSNE2.neg)))


final_table <- data.frame(
  mtSNE1.pos = rownames_mtSNE1.pos,
  mtSNE1.neg = rownames_mtSNE1.neg,
  mtSNE2.pos = rownames_mtSNE2.pos,
  mtSNE2.neg = rownames_mtSNE2.neg
)

final_table

4.3 UMAP

local.config <- umap.defaults
# local.config$n_neighbors <- 4
# local.config$n_components <- 2
# local.config$n_epochs <- 100
# local.config$metric<- "euclidean"
set.seed(1234)
umap.ad <- umap(mat_exp_alz_genes,random_stage=1234, local.config)

umap.data <- as.data.frame(umap.ad$layout)
row.names(umap.data) <- row.names(mat_exp_alz_genes)
umap.data.covs <- merge(umap.data, covs, by = "row.names")
umap.data.covs$Row.names <- NULL
row.names(umap.data.covs) <- umap.data.covs$mrna_id

4.3.0.1 Seleccionamos outliers

rownames(umap.data) <- rownames(mat_exp_alz_genes)

# Calcular la media y la desviación estándar para cada componente de t-SNE
mean.umap1 <- mean(umap.data[,1])
sd.umap1 <- sd(umap.data[,1])
mean.umap2 <- mean(umap.data[,2])
sd.umap2 <- sd(umap.data[,2])

# Identificar muestras a más de 2 desviaciones estándar de la media
outliers.umap1 <- umap.data[abs(umap.data[,1] - mean.umap1) > 1.5 * sd.umap1, ]
outliers.umap2 <-umap.data[abs(umap.data[,2] - mean.umap2) > 1.5 * sd.umap2, ]

mUMAP1.pos <- rownames(outliers.umap1[outliers.umap1[,1] > 0 , , drop = F])

mUMAP1.neg <- rownames(outliers.umap1[outliers.umap1[,1] < 0 , , drop = F])

mUMAP2.pos <- rownames(outliers.umap2[outliers.umap2[,1] > 0 , , drop = F])

mUMAP2.neg <- rownames(outliers.umap2[outliers.umap2[,1] < 0 , , drop = F])

Tabla outliers UMAP

# Asegurarse de que todos son vectores del mismo largo para el dataframe final
max_length <- max(length(mUMAP1.pos),
                  length(mUMAP1.neg),
                  length(mUMAP2.pos), 
                  length(mUMAP2.neg))

# Normalizar la longitud de los vectores (en caso de que alguno sea más corto)
rownames_mtSNE1.pos <- c(mUMAP1.pos, rep("", max_length - length(mUMAP1.pos)))
rownames_mtSNE1.neg <- c(mUMAP1.neg, rep("", max_length - length(mUMAP1.neg)))
rownames_mtSNE2.pos <- c(mUMAP2.pos, rep("", max_length - length(mUMAP2.pos)))
rownames_mtSNE2.neg <- c(mUMAP2.neg, rep("", max_length - length(mUMAP2.neg)))


final_table <- data.frame(
  mUMAP1.pos = rownames_mtSNE1.pos,
  mUMAP1.neg = rownames_mtSNE1.neg,
  mUMAP2.pos = rownames_mtSNE2.pos,
  mUMAP2.neg = rownames_mtSNE2.neg
)

final_table

5 Mecanismo de anotación basado en diferencias

5.1 Covariables en outliers

Tabla frecuencia outliers SANOS Y ENFERMOS con PCA

# PCA

for (i in rownames(covs2)){
  if (i %in% mPC1.pos){
    covs2[i, "sampleset_PCA"] <- "mPC1 positive"
  }
  else if (i %in% mPC1.neg){
    covs2[i, "sampleset_PCA"] <- "mPC1 negative"
  }
  else if (i %in% mPC2.pos ){
    covs2[i, "sampleset_PCA"] <- "mPC2 positive"
  }
  else if (i %in% mPC2.neg){
    covs2[i, "sampleset_PCA"] <- "mPC2 negative"
  }
  else {
    covs2[i, "sampleset_PCA"] <- "Not in both"
  }
}

covs2$sampleset_PCA <- as.factor(covs2$sampleset_PCA)
data.frame(table(covs2$sampleset_PCA))

Tabla frecuencia outliers SOLAMENTE ENFERMOS con PCA

data.frame(table(covs2$sampleset_PCA[covs2$neuroStatus == 1]))

Tabla frecuencia outliers SANOS Y ENFERMOS con tSNE

#tsne

for (i in rownames(covs2)){
  if (i %in% mtSNE1.pos){
    covs2[i, "sampleset_tSNE"] <- "mtSNE1 positive"
  }
  else if (i %in% mtSNE1.neg){
    covs2[i, "sampleset_tSNE"] <- "mtSNE1 negative"
  }
  else if (i %in% mtSNE2.pos){
    covs2[i, "sampleset_tSNE"] <- "mtSNE2 positive"
  }
  else if (i %in% mtSNE2.neg){
    covs2[i, "sampleset_tSNE"] <- "mtSNE2 negative"
  } else {
    covs2[i, "sampleset_tSNE"] <- "Not in both"
  }
}

covs2$sampleset_tSNE <- as.factor(covs2$sampleset_tSNE)
data.frame(table(covs2$sampleset_tSNE))

Tabla frecuencia outliers SOLAMENTE ENFERMOS con tSNE

data.frame(table(covs2$sampleset_tSNE[covs2$neuroStatus == 1]))

Tabla frecuencia outliers SANOS Y ENFERMOS con UMAP

# UMAP

for (i in rownames(covs2)){
  if (i %in% mUMAP1.pos){
    covs2[i, "sampleset_UMAP"] <- "mUMAP1 positive"
  }
  else if (i %in% mUMAP1.neg){
    covs2[i, "sampleset_UMAP"] <- "mUMAP1 negative"
  }
  else if (i %in% mUMAP2.pos){
    covs2[i, "sampleset_UMAP"] <- "mUMAP2 positive"
  }
  else if (i %in% mUMAP2.neg){
    covs2[i, "sampleset_UMAP"] <- "mUMAP2 negative"
  } else {
    covs2[i, "sampleset_UMAP"] <- "Not in both"
  }
}

covs2$sampleset_UMAP <- as.factor(covs2$sampleset_UMAP)
data.frame(table(covs2$sampleset_UMAP))

Tabla frecuencia outliers SOLAMENTE ENFERMOS con UMAP

data.frame(table(covs2$sampleset_UMAP[covs2$neuroStatus == 1]))

5.2 Fitrado de casos

covs2.casos <- covs2 %>%
  filter(neuroStatus == 1)
reactable(covs2.casos)

6 Mecanismo de anotación basado en diferencias

Voy a comparar en una tabla los ratios de las covariables de los outliers comparando con los ratios de covariables de los no outliers

calculo.ratios <- function(df.covs){
  ratios <- data.frame()
  ratios$ratios <- numeric(0)
  
  for (i in names(df.covs)) {
    if (class(covs2[[i]]) == "factor") {
      
      if (grepl("sampleset_", i) | grepl("batch", i)){
        next
        
      } else {
        frecuencia <- table(df.covs[[i]])
  
        if (length(frecuencia) == 2) {
          ratio <- frecuencia[2] / frecuencia[1]
          nuevafila <- data.frame(ratios = ratio)
          row.names(nuevafila) <- i
          ratios <- rbind(ratios, nuevafila)
          
        } 
      } 
    } 
  }
  
  return(ratios)
}
data.frame.ratios <- function(df, sampleset) {
  niveles.sampleset <- levels(df[[sampleset]])

  ratios <- data.frame(matrix(ncol = 0, nrow = length(rownames(calculo.ratios(df)))))
  
  for (i in niveles.sampleset) {
    ratio <- df[df[[sampleset]] == i, ] %>%
      calculo.ratios()
    
    nombre.filas <- rownames(ratio)
    
    colnames(ratio)[1] <- i
    
    ratios <- cbind(ratios, ratio)
    rownames(ratios) <- nombre.filas  
  }
  return(ratios)
}

6.1 PCA

6.1.1 CASOS

ratios.PCA.kegg <- data.frame.ratios(covs2.casos, "sampleset_PCA")
ratio.ouliers.vs.no.ouliers <- data.frame(matrix(ncol = 0, nrow = length(rownames(ratios.PCA.kegg))))

for (i in colnames(ratios.PCA.kegg)) {
  if (i == "Not in both") {
    next
    
  } else {
    ratio <- data.frame(log2(ratios.PCA.kegg[[i]] / ratios.PCA.kegg[["Not in both"]]))
    
    nombre.filas <- rownames(ratios.PCA.kegg)
    
    colnames(ratio)[1] <- paste(i, " vs No outliers")
    
    ratio.ouliers.vs.no.ouliers <- cbind(ratio.ouliers.vs.no.ouliers, ratio)
    rownames(ratio.ouliers.vs.no.ouliers) <- nombre.filas  
  }
}
ratio.ouliers.vs.no.ouliers <- round(ratio.ouliers.vs.no.ouliers, 2)
ratio.ouliers.vs.no.ouliers <- ratio.ouliers.vs.no.ouliers %>%
  filter(row_number() != which(rownames(ratio.ouliers.vs.no.ouliers) == "neuroStatus"))
ratio.ouliers.vs.no.ouliers <- replace(ratio.ouliers.vs.no.ouliers, is.na(ratio.ouliers.vs.no.ouliers), "sin casos")

ratio outliers vs no outliers

reactable(ratio.ouliers.vs.no.ouliers)
reactable(data.frame(t(ratio.ouliers.vs.no.ouliers)))

6.1.1.1 Test estadistico

calculo.test <- function(df.covs, sampleset){
  
  resultados <- data.frame(
    "Covariable" = character(0), 
    "Principal component" = character(0),
    "Minimun expected frequency" = character(0),
    "test" = character(0),
    "p-value" = numeric(0),
    "test 2" = character(0),
    "p-value 2" = numeric(0),
    "Real_mPC_0" = numeric(0),
    "Expected_mPC_0" = numeric(0),
    "Real_mPC_1" = numeric(0),
    "Expected_mPC_1" = numeric(0),
    "Real_NOT_mPC_0" = numeric(0),
    "Expected_NOT_mPC_0" = numeric(0),
    "Real_NOT_mPC_1" = numeric(0),
    "Expected_NOT mPC_1" = numeric(0),
    stringsAsFactors = FALSE
  )
  
  for (i in names(df.covs)) {
    
    if (i == "neuroStatus") {
      next
      
    }
    if (class(df.covs[[i]]) == "factor") {
      
      if (grepl("sampleset_", i) | grepl("batch", i)){
        next
        
      } else {
        
        for (j in levels(df.covs[[sampleset]])) {
          
          if (j == "Not in both") {
            next
            
          } else {
            df <- df.covs %>%
              mutate(modified.class = ifelse(!!sym(sampleset) == j, j, paste("Not", j))) %>%
              dplyr::select(all_of(i), modified.class) %>%
              mutate(class = factor(modified.class, levels = c(j, paste("Not", j))))
            
            frecuencia <- table(df[[i]], df[["class"]])
            
            rownames(frecuencia) <- c(paste(i, rownames(frecuencia), sep = "_"))
            
            expected <- chisq.test(frecuencia)$expected
            
            if (any(expected <= 5)) {
              # Obtener la posición del valor mínimo
              posicion_minimo <- which(frecuencia == min(frecuencia), arr.ind = TRUE)
              nombres.columnas <- colnames(frecuencia)
              nombres.filas <- rownames(frecuencia)
              
              test <- fisher.test(frecuencia)
              test.utilizado <- "Fisher's exact test"
              
              test2 <- chisq.test(frecuencia, correct = TRUE)
              test2.utilizado <- "Chi-Square test with Yate's correction"
              
            } else {
              test <- chisq.test(frecuencia, correct = TRUE)
              test.utilizado <- "Chi-Square test with Yate's correction"
              
              test2 <- chisq.test(matrix(c(1,1,1,1), nrow = 2))
              test2.utilizado <- "N/A"
            }
            
            nuevafila <- data.frame(
              "Covariable" = i,
              "Principal component" = j,
              "Minimun expected frequency" = round(min(expected), 2),
              "test" = test.utilizado,
              "p-value" = round(test$p.value, 3),
              "test 2" = test2.utilizado,
              "p-value 2" = ifelse(test2.utilizado == "N/A", NA, round(test2$p.value, 3)),
              "Real_mPC_0" = round(frecuencia[1,1], 2),
              "Expected_mPC_0" = round(expected[1,1], 2),
              "Real_mPC_1" = round(frecuencia[2,1], 2),
              "Expected_mPC_1" = round(expected[2,1], 2),
              "Real_NOT_mPC_0" = round(frecuencia[1,2], 2),
              "Expected_NOT_mPC_0" = round(expected[1,2], 2),
              "Real_NOT_mPC_1" = round(frecuencia[2,2], 2),
              "Expected_NOT mPC_1" = round(expected[2,2], 2),
              stringsAsFactors = FALSE
            )
            
            resultados <- rbind(resultados, nuevafila)
          }
        }
      } 
    } 
  } 
  
  return(resultados)
}

Calculo test estadistico

reactable(calculo.test(df.covs = covs2.casos, sampleset = "sampleset_PCA"))
tests <- calculo.test(df.covs = covs2.casos, sampleset = "sampleset_PCA") %>%
  filter(p.value <= 0.1) 
plot.pvalue.sig <- function(df, test.df) {
  
  plots <- list()
  
  for (i in 1:nrow(test.df)) {
    variable <- as.character(test.df[i, 1])
    component <- as.character(test.df[i, 2])
    Minimun.expected.frequency <- as.character(test.df[i, 3])
    test.used <- as.character(test.df[i, 4])
    p.value <- as.character(test.df[i, 5])
    
    df.filtered <- df %>%
      mutate(modified.class = ifelse(sampleset_PCA == component, component, paste("Not", component))) %>%
      dplyr::select(all_of(variable), modified.class) %>%
      mutate(class = factor(modified.class, levels = c(component, paste("Not", component))))
    
    p <- ggbarstats(
      data = df.filtered, 
      x = !!sym(variable), 
      y = class, 
      xlab = "", title = paste(component, variable), 
      results.subtitle = FALSE, 
      subtitle = paste0(test.used, " with p-value = ", p.value, ". \nMinimum expected freq: ", Minimun.expected.frequency))
    
    plots[[paste(variable, component)]] <- p
  }
  
  return(plots)
}

plots <- plot.pvalue.sig(df = covs2.casos, test.df = tests)

plot_list <- lapply(plots, ggplotGrob)

grid.arrange(grobs = plot_list, ncol = 2)

6.1.2 Casos y sanos

ratios.PCA.kegg <- data.frame.ratios(covs2, "sampleset_PCA")
ratio.ouliers.vs.no.ouliers <- data.frame(matrix(ncol = 0, nrow = length(rownames(ratios.PCA.kegg))))

for (i in colnames(ratios.PCA.kegg)) {
  if (i == "Not in both") {
    next
    
  } else {
    ratio <- data.frame(log2(ratios.PCA.kegg[[i]] / ratios.PCA.kegg[["Not in both"]]))
    
    nombre.filas <- rownames(ratios.PCA.kegg)
    
    colnames(ratio)[1] <- paste(i, " vs No outliers")
    
    ratio.ouliers.vs.no.ouliers <- cbind(ratio.ouliers.vs.no.ouliers, ratio)
    rownames(ratio.ouliers.vs.no.ouliers) <- nombre.filas  
  }
}
ratio.ouliers.vs.no.ouliers <- round(ratio.ouliers.vs.no.ouliers, 2)
ratio.ouliers.vs.no.ouliers <- ratio.ouliers.vs.no.ouliers %>%
  filter(row_number() != which(rownames(ratio.ouliers.vs.no.ouliers) == "neuroStatus"))
ratio.ouliers.vs.no.ouliers <- replace(ratio.ouliers.vs.no.ouliers, is.na(ratio.ouliers.vs.no.ouliers), "sin casos")

ratio outliers vs no outliers

reactable(ratio.ouliers.vs.no.ouliers)
reactable(data.frame(t(ratio.ouliers.vs.no.ouliers)))

6.1.2.1 Test estadistico

Calculo test estadistico

reactable(calculo.test(df.covs = covs2, sampleset = "sampleset_PCA"))
tests <- calculo.test(df.covs = covs2, sampleset = "sampleset_PCA") %>%
  filter(p.value <= 0.1) 
plots <- plot.pvalue.sig(df = covs2, test.df = tests)

plot_list <- lapply(plots, ggplotGrob)

grid.arrange(grobs = plot_list, ncol = 2)

6.2 tSNE

6.2.1 CASOS

ratios.tsne.kegg <- data.frame.ratios(covs2.casos, "sampleset_tSNE")
ratio.ouliers.vs.no.ouliers <- data.frame(matrix(ncol = 0, nrow = length(rownames(ratios.tsne.kegg))))

for (i in colnames(ratios.tsne.kegg)) {
  if (i == "Not in both") {
    next
    
  } else {
    ratio <- data.frame(log2(ratios.tsne.kegg[[i]] / ratios.tsne.kegg[["Not in both"]]))
    
    nombre.filas <- rownames(ratios.tsne.kegg)
    
    colnames(ratio)[1] <- paste(i, " vs No outliers")
    
    ratio.ouliers.vs.no.ouliers <- cbind(ratio.ouliers.vs.no.ouliers, ratio)
    rownames(ratio.ouliers.vs.no.ouliers) <- nombre.filas  
  }
}
ratio.ouliers.vs.no.ouliers <- round(ratio.ouliers.vs.no.ouliers, 2)
ratio.ouliers.vs.no.ouliers <- ratio.ouliers.vs.no.ouliers %>%
  filter(row_number() != which(rownames(ratio.ouliers.vs.no.ouliers) == "neuroStatus"))
ratio.ouliers.vs.no.ouliers <- replace(ratio.ouliers.vs.no.ouliers, is.na(ratio.ouliers.vs.no.ouliers), "sin casos")

ratio outliers vs no outliers

reactable(ratio.ouliers.vs.no.ouliers)
reactable(data.frame(t(ratio.ouliers.vs.no.ouliers)))

6.2.1.1 Test estadistico

Calculo test estadistico

reactable(calculo.test(df.covs = covs2.casos, sampleset = "sampleset_tSNE"))
tests <- calculo.test(df.covs = covs2.casos, sampleset = "sampleset_tSNE") %>%
  filter(p.value <= 0.1) 
plot.pvalue.sig <- function(df, test.df) {
  
  plots <- list()
  
  for (i in 1:nrow(test.df)) {
    variable <- as.character(test.df[i, 1])
    component <- as.character(test.df[i, 2])
    Minimun.expected.frequency <- as.character(test.df[i, 3])
    test.used <- as.character(test.df[i, 4])
    p.value <- as.character(test.df[i, 5])
    
    df.filtered <- df %>%
      mutate(modified.class = ifelse(sampleset_tSNE == component, component, paste("Not", component))) %>%
      dplyr::select(all_of(variable), modified.class) %>%
      mutate(class = factor(modified.class, levels = c(component, paste("Not", component))))
    
    p <- ggbarstats(
      data = df.filtered, 
      x = !!sym(variable), 
      y = class, 
      xlab = "", title = paste(component, variable), 
      results.subtitle = FALSE, 
      subtitle = paste0(test.used, " with p-value = ", p.value, ". \nMinimum expected freq: ", Minimun.expected.frequency))
    
    plots[[paste(variable, component)]] <- p
  }
  
  return(plots)
}

plots <- plot.pvalue.sig(df = covs2.casos, test.df = tests)

plot_list <- lapply(plots, ggplotGrob)

grid.arrange(grobs = plot_list, ncol = 2)

6.2.2 Casos y sanos

ratios.tSNE.kegg <- data.frame.ratios(covs2, "sampleset_tSNE")
ratio.ouliers.vs.no.ouliers <- data.frame(matrix(ncol = 0, nrow = length(rownames(ratios.tSNE.kegg))))

for (i in colnames(ratios.tSNE.kegg)) {
  if (i == "Not in both") {
    next
    
  } else {
    ratio <- data.frame(log2(ratios.tSNE.kegg[[i]] / ratios.tSNE.kegg[["Not in both"]]))
    
    nombre.filas <- rownames(ratios.tSNE.kegg)
    
    colnames(ratio)[1] <- paste(i, " vs No outliers")
    
    ratio.ouliers.vs.no.ouliers <- cbind(ratio.ouliers.vs.no.ouliers, ratio)
    rownames(ratio.ouliers.vs.no.ouliers) <- nombre.filas  
  }
}
ratio.ouliers.vs.no.ouliers <- round(ratio.ouliers.vs.no.ouliers, 2)
ratio.ouliers.vs.no.ouliers <- ratio.ouliers.vs.no.ouliers %>%
  filter(row_number() != which(rownames(ratio.ouliers.vs.no.ouliers) == "neuroStatus"))
ratio.ouliers.vs.no.ouliers <- replace(ratio.ouliers.vs.no.ouliers, is.na(ratio.ouliers.vs.no.ouliers), "sin casos")

ratio outliers vs no outliers

reactable(ratio.ouliers.vs.no.ouliers)
reactable(data.frame(t(ratio.ouliers.vs.no.ouliers)))

6.2.2.1 Test estadistico

Calculo test estadistico

reactable(calculo.test(df.covs = covs2, sampleset = "sampleset_tSNE"))
tests <- calculo.test(df.covs = covs2, sampleset = "sampleset_tSNE") %>%
  filter(p.value <= 0.1) 
plot.pvalue.sig <- function(df, test.df) {
  
  plots <- list()
  
  for (i in 1:nrow(test.df)) {
    variable <- as.character(test.df[i, 1])
    component <- as.character(test.df[i, 2])
    Minimun.expected.frequency <- as.character(test.df[i, 3])
    test.used <- as.character(test.df[i, 4])
    p.value <- as.character(test.df[i, 5])
    
    df.filtered <- df %>%
      mutate(modified.class = ifelse(sampleset_tSNE == component, component, paste("Not", component))) %>%
      dplyr::select(all_of(variable), modified.class) %>%
      mutate(class = factor(modified.class, levels = c(component, paste("Not", component))))
    
    p <- ggbarstats(
      data = df.filtered, 
      x = !!sym(variable), 
      y = class, 
      xlab = "", title = paste(component, variable), 
      results.subtitle = FALSE, 
      subtitle = paste0(test.used, " with p-value = ", p.value, ". \nMinimum expected freq: ", Minimun.expected.frequency))
    
    plots[[paste(variable, component)]] <- p
  }
  
  return(plots)
}

plots <- plot.pvalue.sig(df = covs2, test.df = tests)

plot_list <- lapply(plots, ggplotGrob)

grid.arrange(grobs = plot_list, ncol = 2)

6.3 UMAP

6.3.1 CASOS

ratios.umap.kegg <- data.frame.ratios(covs2.casos, "sampleset_UMAP")
ratio.ouliers.vs.no.ouliers <- data.frame(matrix(ncol = 0, nrow = length(rownames(ratios.umap.kegg))))

for (i in colnames(ratios.umap.kegg)) {
  if (i == "Not in both") {
    next
    
  } else {
    ratio <- data.frame(log2(ratios.umap.kegg[[i]] / ratios.umap.kegg[["Not in both"]]))
    
    nombre.filas <- rownames(ratios.umap.kegg)
    
    colnames(ratio)[1] <- paste(i, " vs No outliers")
    
    ratio.ouliers.vs.no.ouliers <- cbind(ratio.ouliers.vs.no.ouliers, ratio)
    rownames(ratio.ouliers.vs.no.ouliers) <- nombre.filas  
  }
}
ratio.ouliers.vs.no.ouliers <- round(ratio.ouliers.vs.no.ouliers, 2)
ratio.ouliers.vs.no.ouliers <- ratio.ouliers.vs.no.ouliers %>%
  filter(row_number() != which(rownames(ratio.ouliers.vs.no.ouliers) == "neuroStatus"))
ratio.ouliers.vs.no.ouliers <- replace(ratio.ouliers.vs.no.ouliers, is.na(ratio.ouliers.vs.no.ouliers), "sin casos")

ratio outliers vs no outliers

reactable(ratio.ouliers.vs.no.ouliers)
reactable(data.frame(t(ratio.ouliers.vs.no.ouliers)))

6.3.1.1 Test estadistico

Calculo test estadistico

reactable(calculo.test(df.covs = covs2.casos, sampleset = "sampleset_UMAP"))
tests <- calculo.test(df.covs = covs2.casos, sampleset = "sampleset_UMAP") %>%
  filter(p.value <= 0.1) 
plot.pvalue.sig <- function(df, test.df) {
  
  plots <- list()
  
  for (i in 1:nrow(test.df)) {
    variable <- as.character(test.df[i, 1])
    component <- as.character(test.df[i, 2])
    Minimun.expected.frequency <- as.character(test.df[i, 3])
    test.used <- as.character(test.df[i, 4])
    p.value <- as.character(test.df[i, 5])
    
    df.filtered <- df %>%
      mutate(modified.class = ifelse(sampleset_UMAP == component, component, paste("Not", component))) %>%
      dplyr::select(all_of(variable), modified.class) %>%
      mutate(class = factor(modified.class, levels = c(component, paste("Not", component))))
    
    p <- ggbarstats(
      data = df.filtered, 
      x = !!sym(variable), 
      y = class, 
      xlab = "", title = paste(component, variable), 
      results.subtitle = FALSE, 
      subtitle = paste0(test.used, " with p-value = ", p.value, ". \nMinimum expected freq: ", Minimun.expected.frequency))
    
    plots[[paste(variable, component)]] <- p
  }
  
  return(plots)
}

plots <- plot.pvalue.sig(df = covs2.casos, test.df = tests)

plot_list <- lapply(plots, ggplotGrob)

grid.arrange(grobs = plot_list, ncol = 2)

6.3.2 Casos y sanos

ratios.umap.kegg <- data.frame.ratios(covs2, "sampleset_UMAP")
ratio.ouliers.vs.no.ouliers <- data.frame(matrix(ncol = 0, nrow = length(rownames(ratios.umap.kegg))))

for (i in colnames(ratios.umap.kegg)) {
  if (i == "Not in both") {
    next
    
  } else {
    ratio <- data.frame(log2(ratios.umap.kegg[[i]] / ratios.umap.kegg[["Not in both"]]))
    
    nombre.filas <- rownames(ratios.umap.kegg)
    
    colnames(ratio)[1] <- paste(i, " vs No outliers")
    
    ratio.ouliers.vs.no.ouliers <- cbind(ratio.ouliers.vs.no.ouliers, ratio)
    rownames(ratio.ouliers.vs.no.ouliers) <- nombre.filas  
  }
}
ratio.ouliers.vs.no.ouliers <- round(ratio.ouliers.vs.no.ouliers, 2)
ratio.ouliers.vs.no.ouliers <- ratio.ouliers.vs.no.ouliers %>%
  filter(row_number() != which(rownames(ratio.ouliers.vs.no.ouliers) == "neuroStatus"))
ratio.ouliers.vs.no.ouliers <- replace(ratio.ouliers.vs.no.ouliers, is.na(ratio.ouliers.vs.no.ouliers), "sin casos")

ratio outliers vs no outliers

reactable(ratio.ouliers.vs.no.ouliers)
reactable(data.frame(t(ratio.ouliers.vs.no.ouliers)))

6.3.2.1 Test estadistico

Calculo test estadistico

reactable(calculo.test(df.covs = covs2, sampleset = "sampleset_UMAP"))
tests <- calculo.test(df.covs = covs2, sampleset = "sampleset_UMAP") %>%
  filter(p.value <= 0.1) 
plot.pvalue.sig <- function(df, test.df) {
  
  plots <- list()
  
  for (i in 1:nrow(test.df)) {
    variable <- as.character(test.df[i, 1])
    component <- as.character(test.df[i, 2])
    Minimun.expected.frequency <- as.character(test.df[i, 3])
    test.used <- as.character(test.df[i, 4])
    p.value <- as.character(test.df[i, 5])
    
    df.filtered <- df %>%
      mutate(modified.class = ifelse(sampleset_UMAP == component, component, paste("Not", component))) %>%
      dplyr::select(all_of(variable), modified.class) %>%
      mutate(class = factor(modified.class, levels = c(component, paste("Not", component))))
    
    p <- ggbarstats(
      data = df.filtered, 
      x = !!sym(variable), 
      y = class, 
      xlab = "", title = paste(component, variable), 
      results.subtitle = FALSE, 
      subtitle = paste0(test.used, " with p-value = ", p.value, ". \nMinimum expected freq: ", Minimun.expected.frequency))
    
    plots[[paste(variable, component)]] <- p
  }
  
  return(plots)
}
plots <- plot.pvalue.sig(df = covs2, test.df = tests)

plot_list <- lapply(plots, ggplotGrob)

if (length(plot_list) > 4) {
  grid.arrange(grobs = c(plot_list[1], plot_list[2], plot_list[3], plot_list[4]), ncol = 2)
  grid.arrange(grobs = c(plot_list[5], plot_list[6], plot_list[7], plot_list[8]), ncol = 2)
  grid.arrange(grobs = c(plot_list[9], plot_list[10]), ncol = 2)
}

7 Mecanismo de anotación dependiente del mecanismo de detección

7.1 PCA

pca_rotation <- round(data.frame(pca_KEGG$rotation[,1:2]),3)

reactable(pca_rotation)
loadings <- pca_KEGG$rotation

loading.pc1.pos <- loadings[loadings[, "PC1"] > 0, "PC1"]
loading.pc1.neg <- loadings[loadings[, "PC1"] < 0, "PC1"]
loading.pc2.pos <- loadings[loadings[, "PC2"] > 0, "PC2"]
loading.pc2.neg <- loadings[loadings[, "PC2"] < 0, "PC2"]

best.loadings.pc1.pos <- data.frame(loading = head(sort(loading.pc1.pos, decreasing = TRUE), 5))
best.loadings.pc1.neg <- data.frame(loading = head(sort(loading.pc1.neg, decreasing = FALSE), 5), pc = "mPC1 negative")
best.loadings.pc2.pos <- data.frame(loading =head(sort(loading.pc2.pos, decreasing = TRUE), 5), pc = "mPC2 positive")
best.loadings.pc2.neg <- data.frame(loading =head(sort(loading.pc2.neg, decreasing = FALSE), 5), pc = "mPC2 negative")
pca.var <- get_pca_var(pca_KEGG)
pca.var.coord <- data.frame(pca.var$coord[,1:2])

pca.var.coord.best.loadings <-  pca.var.coord[(rownames(pca.var.coord) %in% rownames(best.loadings.pc1.pos)) | (rownames(pca.var.coord) %in% rownames(best.loadings.pc1.neg)) | (rownames(pca.var.coord) %in% rownames(best.loadings.pc2.pos)) | (rownames(pca.var.coord) %in% rownames(best.loadings.pc2.neg)), ]


pca.var.coord.best.loadings <- pca.var.coord.best.loadings %>%
  mutate(PC = case_when(
    row.names(.) %in% rownames(best.loadings.pc1.pos) ~ "mPC1 positive",
    row.names(.) %in% rownames(best.loadings.pc1.neg) ~ "mPC1 negative",
    row.names(.) %in% rownames(best.loadings.pc2.pos) ~ "mPC2 positive",
    row.names(.) %in% rownames(best.loadings.pc2.neg) ~ "mPC2 negative",
    TRUE ~ NA_character_  # para cualquier otra condición no especificada
  ))
pca.data.kegg <- data.frame(pca_KEGG$x)[,1:2]
colnames(pca.data.kegg) <- c("PC1.KEGG", "PC2.KEGG")

covs2 <- merge(covs2, pca.data.kegg, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
pca.data.kegg <- data.frame(pca_KEGG$x)[,1:2]
colnames(pca.data.kegg) <- c("PC1.KEGG", "PC2.KEGG")

covs2_df <- covs2.casos %>%
  dplyr::select(mrna_id, sampleset_PCA)

para.plot <- merge(pca.data.kegg, covs2_df, by = "row.names")
rownames(para.plot) <- para.plot$Row.names
para.plot$Row.names <- NULL

pca.var.coord.best.loadings$id <- rownames(pca.var.coord.best.loadings)

p <- ggplot(para.plot, aes_string(x = "PC1.KEGG", y = "PC2.KEGG", color = "sampleset_PCA")) +
    geom_point(show.legend = TRUE, size = 3) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outlierspc1) | mrna_id %in% rownames(outlierspc2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 30, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = "PCA plot loadings",
         x = "PC1.KEGG", y = "PC2.KEGG") +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm")) +
  geom_point(data= (covs2 %>% filter(neuroStatus == 0)), aes_string(x = "PC1.KEGG", y = "PC2.KEGG", color = "sampleset_PCA"), alpha = 0.5)+ 
  geom_segment(data = pca.var.coord.best.loadings, aes(x= 0, y = 0, xend = Dim.1*20, yend = Dim.2*20, color = PC), arrow = arrow(length = unit(0.05, "inches"), type = "closed"),
               size = 0.4) +
  geom_text_repel(data = pca.var.coord.best.loadings,
                    aes(x = Dim.1*22, 
                        y = Dim.2*22,
                        label=ifelse((id %in% rownames(best.loadings.pc1.pos) | id %in% rownames(best.loadings.pc1.neg) | id %in% rownames(best.loadings.pc2.pos) | id %in% rownames(best.loadings.pc2.neg)), id, ""),
                        color = PC),
                  max.overlaps = 10, # Reduce el número máximo de solapamientos
                  size = 3, 
                  fontface = "bold",
                  segment.color = 'grey50',
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) 

p

7.1.1 Enriquecimiento

loading.pc1.pos.ordered <- data.frame(loading = sort(loading.pc1.pos, decreasing = TRUE))
loading.pc1.neg.ordered <- data.frame(loading = abs(sort(loading.pc1.neg, decreasing = F)))
loading.pc2.pos.ordered <- data.frame(loading = sort(loading.pc2.pos, decreasing = TRUE))
loading.pc2.neg.ordered <- data.frame(loading = abs(sort(loading.pc2.neg, decreasing = F)))

p1 <- loading.pc1.pos.ordered %>%
  rownames_to_column() %>%
  head(50) %>%
  mutate(row_index = row_number(),  
         rowname = forcats::fct_inorder(rowname)) %>%
  ggplot(aes(x = row_index, y = loading)) +  
    geom_point() +
    geom_line() +
    labs(x = "Número Genes", title = "PC1+") +
    theme_bw() +
    theme(axis.text.x = element_text(size = 10))

p2 <- loading.pc1.neg.ordered %>%
  rownames_to_column() %>%
  head(50) %>%
  mutate(row_index = row_number(),  
         rowname = forcats::fct_inorder(rowname)) %>%
  ggplot(aes(x = row_index, y = loading)) +  
    geom_point() +
    geom_line() +
    labs(x = "Número Genes", title = "PC1-") +
    theme_bw() +
    theme(axis.text.x = element_text(size = 10))

p3 <- loading.pc2.pos.ordered %>%
  rownames_to_column() %>%
  head(50) %>%
  mutate(row_index = row_number(),  
         rowname = forcats::fct_inorder(rowname)) %>%
  ggplot(aes(x = row_index, y = loading)) +  
    geom_point() +
    geom_line() +
    labs(x = "Número Genes", title = "PC2+") +
    theme_bw() +
    theme(axis.text.x = element_text(size = 10))

p4 <- loading.pc2.neg.ordered %>%
  rownames_to_column() %>%
  head(50) %>%
  mutate(row_index = row_number(),  
         rowname = forcats::fct_inorder(rowname)) %>%
  ggplot(aes(x = row_index, y = loading)) +  
    geom_point() +
    geom_line() +
    labs(x = "Número Genes", title = "PC2-") +
    theme_bw() +
    theme(axis.text.x = element_text(size = 10))

grid.arrange(p1, p2, p3, p4, ncol = 2)

7.1.1.1 GO

multi_gp = gost(list("PC1+" = row.names(head(loading.pc1.pos.ordered, 30)),
                     "PC1-" = row.names(head(loading.pc1.neg.ordered, 30)),
                     "PC2+" = row.names(head(loading.pc2.pos.ordered, 30)),
                     "PC2-" = row.names(head(loading.pc2.neg.ordered, 30))),
                organism = "hsapiens", 
                multi_query = T, 
                sources = "GO", 
                highlight = T
)
gostplot(multi_gp, interactive = T, capped = F)
loading.pc1.pos.ordered.converted = gconvert(row.names(head(loading.pc1.pos.ordered, 30)))
loading.pc1.neg.ordered.converted = gconvert(row.names(head(loading.pc1.neg.ordered, 30)))
loading.pc2.pos.ordered.converted = gconvert(row.names(head(loading.pc2.pos.ordered, 30)))
loading.pc2.neg.ordered.converted = gconvert(row.names(head(loading.pc2.neg.ordered, 30)))

# enrichment analysis using gene names
multi_gp = gost(list("PC1+" = loading.pc1.pos.ordered.converted$name,
                     "PC1-" = loading.pc1.neg.ordered.converted$name,
                     "PC2+" = loading.pc2.pos.ordered.converted$name,
                     "PC2-" = loading.pc2.neg.ordered.converted$name), 
          multi_query = FALSE, 
          #ordered_query = T,
          evcodes = TRUE, 
          sources = "GO",
          highlight = T)

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

#quitar los duplicados
duplicados <- gp_mod$ID[duplicated(gp_mod$ID) | duplicated(gp_mod$ID, fromLast = TRUE)]
gp_mod_unicos <- gp_mod[!(gp_mod$ID %in% duplicados), ]

row.names(gp_mod_unicos) <- gp_mod_unicos$ID

# define as compareClusterResult object
gp_mod_cluster = new("compareClusterResult", compareClusterResult = gp_mod_unicos)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod_unicos)

enrichplot::dotplot(gp_mod_cluster)

7.1.1.2 KEGG

multi_gp = gost(list("PC1+" = row.names(head(loading.pc1.pos.ordered, 30)),
                     "PC1-" = row.names(head(loading.pc1.neg.ordered, 30)),
                     "PC2+" = row.names(head(loading.pc2.pos.ordered, 30)),
                     "PC2-" = row.names(head(loading.pc2.neg.ordered, 30))),
                organism = "hsapiens", 
                multi_query = T, sources = "KEGG"
)
gostplot(multi_gp, interactive = T, capped = F)
loading.pc1.pos.ordered.converted = gconvert(row.names(head(loading.pc1.pos.ordered, 30)))
loading.pc1.neg.ordered.converted = gconvert(row.names(head(loading.pc1.neg.ordered, 30)))
loading.pc2.pos.ordered.converted = gconvert(row.names(head(loading.pc2.pos.ordered, 30)))
loading.pc2.neg.ordered.converted = gconvert(row.names(head(loading.pc2.neg.ordered, 30)))

# enrichment analysis using gene names
multi_gp = gost(list("PC1+" = loading.pc1.pos.ordered.converted$name,
                     "PC1-" = loading.pc1.neg.ordered.converted$name,
                     "PC2+" = loading.pc2.pos.ordered.converted$name,
                     "PC2-" = loading.pc2.neg.ordered.converted$name), 
          multi_query = FALSE, evcodes = TRUE, sources = "KEGG")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

#quitar los duplicados
duplicados <- gp_mod$ID[duplicated(gp_mod$ID) | duplicated(gp_mod$ID, fromLast = TRUE)]
gp_mod_unicos <- gp_mod[!(gp_mod$ID %in% duplicados), ]

row.names(gp_mod_unicos) <- gp_mod_unicos$ID

# define as compareClusterResult object
gp_mod_cluster = new("compareClusterResult", compareClusterResult = gp_mod_unicos)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod_unicos)

enrichplot::dotplot(gp_mod_cluster)

7.1.1.3 REAC

multi_gp = gost(list("PC1+" = row.names(head(loading.pc1.pos.ordered, 30)),
                     "PC1-" = row.names(head(loading.pc1.neg.ordered, 30)),
                     "PC2+" = row.names(head(loading.pc2.pos.ordered, 30)),
                     "PC2-" = row.names(head(loading.pc2.neg.ordered, 30))),
                organism = "hsapiens", 
                ordered_query = T, 
                multi_query = T, sources = "REAC"
)
gostplot(multi_gp, interactive = T, capped = F)
loading.pc1.pos.ordered.converted = gconvert(row.names(head(loading.pc1.pos.ordered, 30)))
loading.pc1.neg.ordered.converted = gconvert(row.names(head(loading.pc1.neg.ordered, 30)))
loading.pc2.pos.ordered.converted = gconvert(row.names(head(loading.pc2.pos.ordered, 30)))
loading.pc2.neg.ordered.converted = gconvert(row.names(head(loading.pc2.neg.ordered, 30)))

# enrichment analysis using gene names
multi_gp = gost(list("PC1+" = loading.pc1.pos.ordered.converted$name,
                     "PC1-" = loading.pc1.neg.ordered.converted$name,
                     "PC2+" = loading.pc2.pos.ordered.converted$name,
                     "PC2-" = loading.pc2.neg.ordered.converted$name), 
          multi_query = FALSE, evcodes = TRUE, sources = "REAC")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

#quitar los duplicados
duplicados <- gp_mod$ID[duplicated(gp_mod$ID) | duplicated(gp_mod$ID, fromLast = TRUE)]
gp_mod_unicos <- gp_mod[!(gp_mod$ID %in% duplicados), ]

row.names(gp_mod_unicos) <- gp_mod_unicos$ID

# define as compareClusterResult object
gp_mod_cluster = new("compareClusterResult", compareClusterResult = gp_mod_unicos)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod_unicos)

enrichplot::dotplot(gp_mod_cluster)

7.2 tSNE

tsne.data <- data.frame(tsne$Y)[,1:2]
colnames(tsne.data) <- c("tsne1", "tsne2")
rownames(tsne.data) <- rownames(mat_exp_alz_genes)

covs2 <- merge(covs2, tsne.data, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
tsne.data <- data.frame(tsne$Y)[,1:2]
colnames(tsne.data) <- c("tsne1", "tsne2")
rownames(tsne.data) <- rownames(mat_exp_alz_genes)

covs2_df <- covs2.casos %>%
  dplyr::select(mrna_id, sampleset_tSNE)

para.plot <- merge(tsne.data, covs2_df, by = "row.names")
rownames(para.plot) <- para.plot$Row.names
para.plot$Row.names <- NULL

ggplot(para.plot, aes_string(x = "tsne1", y = "tsne2", color = "sampleset_tSNE")) +
    geom_point(show.legend = TRUE, size = 3) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outliers.tsne1) | mrna_id %in% rownames(outliers.tsne2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 30, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = "tSNE",
         x = "tsne1", y = "tsne2") +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm")) +
  geom_point(data= (covs2 %>% filter(neuroStatus == 0)), aes_string(x = "tsne1", y = "tsne2", color = "sampleset_tSNE"), alpha = 0.5) 

7.3 UMAP

umap.data <- data.frame(umap.ad$layout)[,1:2]
colnames(umap.data) <- c("umap1", "umap2")

covs2 <- merge(covs2, umap.data, by="row.names")
rownames(covs2) <- covs2$mrna_id
covs2$Row.names <- NULL
umap.data <- data.frame(umap.ad$layout)[,1:2]
colnames(umap.data) <- c("umap1", "umap2")

covs2_df <- covs2.casos %>%
  dplyr::select(mrna_id, sampleset_UMAP)

para.plot <- merge(umap.data, covs2_df, by = "row.names")
rownames(para.plot) <- para.plot$Row.names
para.plot$Row.names <- NULL

ggplot(para.plot, aes_string(x = "umap1", y = "umap2", color = "sampleset_UMAP")) +
    geom_point(show.legend = TRUE, size = 3) +
    geom_text_repel(aes(label=ifelse(mrna_id %in% rownames(outliers.umap1) | mrna_id %in% rownames(outliers.umap2), as.character(gsub("_.*", "", covs2$mrna_id)), "")),
                  color = "black",
                  max.overlaps = 30, # Reduce el número máximo de solapamientos
                  point.padding = unit(0.2, "lines"), # Menos padding alrededor de los puntos
                  size = 3, 
                  fontface = "bold",
                  segment.size = 0.2, # Líneas de guía más finas
                  segment.color = 'grey50',
                  max.segment.length = unit(0.5, "lines"), # Líneas de guía más cortas
                  arrow = arrow(length = unit(0.02, "npc"), type = "closed", ends = "last")) +
    theme_minimal() +
    labs(title = "UMAP",
         x = "umap1", y = "umap2") +
    theme(
      legend.title = element_blank(),
      legend.text = element_text(size = 12),
      text = element_text(size = 12),
      legend.key.size = unit(0.5, "cm")) +
  geom_point(data= (covs2 %>% filter(neuroStatus == 0)), aes_string(x = "umap1", y = "umap2", color = "sampleset_UMAP"), alpha = 0.5) 

7.3.1 Seleccion genes importantes

7.3.1.1 mUMAP1+

# Crear un vector con todos los individuos etiquetados como "Sano"
estado <- rep(0, nrow(mat_exp_alz_genes))

# Reemplazar "Sano" con "Enfermo" para los individuos en la lista de enfermos
estado[rownames(mat_exp_alz_genes) %in% mUMAP1.pos] <- 1

# Añadir la nueva columna a la matriz de expresión de genes
mat_exp_alz_genes.RF <- data.frame(cbind(mat_exp_alz_genes, class = estado))

mat_exp_alz_genes.RF$class <- as.factor(mat_exp_alz_genes.RF$class)
library(caret)
library(randomForest)
library(doParallel)
num_cores <- detectCores() - 2
registerDoParallel(cores=num_cores)
modelLookup("ranger")

Aquí he utilizado el parámetro importance = “permutation”

# Definir una función para calcular la Accuracy balanceada
balanced_accuracy <- function(data, lev = NULL, model = NULL) {
  cm <- confusionMatrix(data$pred, data$obs)
  sensitivity <- cm$byClass["Sensitivity"]
  specificity <- cm$byClass["Specificity"]
  balanced_acc <- mean(c(sensitivity, specificity))
  c(BalancedAccuracy = balanced_acc)
}

# Definir las métricas de evaluación que queremos utilizar
RFControl <- trainControl(
  method = "cv", 
  number = 10,
  repeats = 10,
  summaryFunction = balanced_accuracy,
  allowParallel = TRUE,
  seeds = NULL, 
  returnResamp = "final",
)

set.seed(1234)
sqr.genes <- round(sqrt(ncol(mat_exp_alz_genes.RF)))

mygrid <- expand.grid(mtry = c(sqr.genes - 2, sqr.genes - 1, sqr.genes, sqr.genes+1, sqr.genes+2),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1)
trees <- seq(500, 2000, 500) 

rf.cv.10 <- list() # lista vacia para meter los resultados de cada bucle
for (tree in trees){ # bucle en cada pasada hace un modelo con un número de arboles distintos
  modelo <- train(class ~., data=mat_exp_alz_genes.RF,
                  method="ranger",
                  tuneGrid=mygrid,
                  trControl=RFControl, 
                  ntree = tree,
                  metric = "BalancedAccuracy",
                  importance = "permutation" # o impurity permutation https://arxiv.org/abs/1407.7502
  )
  rf.cv.10[[paste(tree, "trees")]] <- modelo # metemos el modelo en la lista
}
# Hacemos dataframe con las métricas de cada modelo
datos_combinados <- rbind(
  data.frame(mtry = rf.cv.10$`500 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`500 trees`$results$BalancedAccuracy, Trees = "500 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`1000 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`1000 trees`$results$BalancedAccuracy, Trees = "1000 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`1500 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`1500 trees`$results$BalancedAccuracy, Trees = "1500 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`2000 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`2000 trees`$results$BalancedAccuracy, Trees = "2000 trees", Metric = "Balanced Accuracy")
)

datos_combinados$Trees <- factor(datos_combinados$Trees, levels = c("500 trees", "1000 trees", "1500 trees", "2000 trees"))

datos_combinados %>%
  # filter(splitrule == "extratrees") %>%
  filter(Metric == "Balanced Accuracy") %>%
  ggplot(aes(x = mtry, y = MetricValue, color = Trees)) +
  geom_point(size = 4) +
  geom_line(linetype = "dashed") +
  theme_minimal() +
  labs(x = "Mtry", y = "Valor de la Métrica", title = "Comparación de Balanced Accuracy con splitrule") +
  facet_wrap(~ splitrule) 

models <- resamples(rf.cv.10)

summary(models)
## 
## Call:
## summary.resamples(object = models)
## 
## Models: 500 trees, 1000 trees, 1500 trees, 2000 trees 
## Number of resamples: 10 
## 
## BalancedAccuracy 
##            Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## 500 trees   0.5 0.6666667 0.6666667 0.7333333 0.8333333 1.0000000    0
## 1000 trees  0.5 0.6666667 0.7083333 0.7083333 0.8333333 0.8333333    0
## 1500 trees  0.5 0.5416667 0.7500000 0.7166667 0.8333333 1.0000000    0
## 2000 trees  0.5 0.5416667 0.6666667 0.7166667 0.8333333 1.0000000    0
important.genes.umap1pos <- varImp(rf.cv.10$`500 trees`)$importance %>% 
  rownames_to_column() %>%
  arrange(desc(Overall))

important.genes.umap1pos %>%
  head(50) %>%
  mutate(row_index = row_number(),  # Crear una columna numérica para el índice
         rowname = forcats::fct_inorder(rowname)) %>%
  ggplot(aes(x = row_index, y = Overall)) +  # Usar row_index en el eje X
    geom_point() +
    geom_line() +
    labs(x = "Número Genes", title = "Importancia genes UMAP1+ en RF") +
    theme_bw() +
    theme(axis.text.x = element_text(size = 10))

7.3.1.2 mUMAP1-

# Crear un vector con todos los individuos etiquetados como "Sano"
estado <- rep(0, nrow(mat_exp_alz_genes))

# Reemplazar "Sano" con "Enfermo" para los individuos en la lista de enfermos
estado[rownames(mat_exp_alz_genes) %in% mUMAP1.neg] <- 1

# Añadir la nueva columna a la matriz de expresión de genes
mat_exp_alz_genes.RF <- data.frame(cbind(mat_exp_alz_genes, class = estado))

mat_exp_alz_genes.RF$class <- as.factor(mat_exp_alz_genes.RF$class)

Aquí he utilizado el parámetro importance = “permutation”

# Definir una función para calcular la Accuracy balanceada
balanced_accuracy <- function(data, lev = NULL, model = NULL) {
  cm <- confusionMatrix(data$pred, data$obs)
  sensitivity <- cm$byClass["Sensitivity"]
  specificity <- cm$byClass["Specificity"]
  balanced_acc <- mean(c(sensitivity, specificity))
  c(BalancedAccuracy = balanced_acc)
}

# Definir las métricas de evaluación que queremos utilizar
RFControl <- trainControl(
  method = "cv", 
  number = 10,
  repeats = 10,
  summaryFunction = balanced_accuracy,
  allowParallel = TRUE,
  seeds = NULL, 
  returnResamp = "final",
)

set.seed(1234)
sqr.genes <- round(sqrt(ncol(mat_exp_alz_genes.RF)))

mygrid <- expand.grid(mtry = c(sqr.genes - 2, sqr.genes - 1, sqr.genes, sqr.genes+1, sqr.genes+2),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1)
trees <- seq(500, 2000, 500) 

rf.cv.10 <- list() # lista vacia para meter los resultados de cada bucle
for (tree in trees){ # bucle en cada pasada hace un modelo con un número de arboles distintos
  modelo <- train(class ~., data=mat_exp_alz_genes.RF,
                  method="ranger",
                  tuneGrid=mygrid,
                  trControl=RFControl, 
                  ntree = tree,
                  metric = "BalancedAccuracy",
                  importance = "permutation" # o impurity permutation https://arxiv.org/abs/1407.7502
  )
  rf.cv.10[[paste(tree, "trees")]] <- modelo # metemos el modelo en la lista
}
# Hacemos dataframe con las métricas de cada modelo
datos_combinados <- rbind(
  data.frame(mtry = rf.cv.10$`500 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`500 trees`$results$BalancedAccuracy, Trees = "500 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`1000 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`1000 trees`$results$BalancedAccuracy, Trees = "1000 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`1500 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`1500 trees`$results$BalancedAccuracy, Trees = "1500 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`2000 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`2000 trees`$results$BalancedAccuracy, Trees = "2000 trees", Metric = "Balanced Accuracy")
)

datos_combinados$Trees <- factor(datos_combinados$Trees, levels = c("500 trees", "1000 trees", "1500 trees", "2000 trees"))

datos_combinados %>%
  # filter(splitrule == "extratrees") %>%
  filter(Metric == "Balanced Accuracy") %>%
  ggplot(aes(x = mtry, y = MetricValue, color = Trees)) +
  geom_point(size = 4) +
  geom_line(linetype = "dashed") +
  theme_minimal() +
  labs(x = "Mtry", y = "Valor de la Métrica", title = "Comparación de Balanced Accuracy con splitrule") +
  facet_wrap(~ splitrule) 

models <- resamples(rf.cv.10)

summary(models)
## 
## Call:
## summary.resamples(object = models)
## 
## Models: 500 trees, 1000 trees, 1500 trees, 2000 trees 
## Number of resamples: 10 
## 
## BalancedAccuracy 
##            Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 500 trees   0.5     0.5    0.5  0.5     0.5  0.5    2
## 1000 trees  0.5     0.5    0.5  0.5     0.5  0.5    2
## 1500 trees  0.5     0.5    0.5  0.5     0.5  0.5    2
## 2000 trees  0.5     0.5    0.5  0.5     0.5  0.5    2
important.genes.umap1neg <- varImp(rf.cv.10$`500 trees`)$importance %>% 
  rownames_to_column() %>%
  arrange(desc(Overall))

important.genes.umap1neg %>%
  head(50) %>%
  mutate(row_index = row_number(),  # Crear una columna numérica para el índice
         rowname = forcats::fct_inorder(rowname)) %>%
  ggplot(aes(x = row_index, y = Overall)) +  # Usar row_index en el eje X
    geom_point() +
    geom_line() +
    labs(x = "Número Genes", title = "Importancia genes UMAP1- en RF") +
    theme_bw() +
    theme(axis.text.x = element_text(size = 10))

7.3.1.3 mUMAP2+

# Crear un vector con todos los individuos etiquetados como "Sano"
estado <- rep(0, nrow(mat_exp_alz_genes))

# Reemplazar "Sano" con "Enfermo" para los individuos en la lista de enfermos
estado[rownames(mat_exp_alz_genes) %in% mUMAP2.pos] <- 1

# Añadir la nueva columna a la matriz de expresión de genes
mat_exp_alz_genes.RF <- data.frame(cbind(mat_exp_alz_genes, class = estado))

mat_exp_alz_genes.RF$class <- as.factor(mat_exp_alz_genes.RF$class)

Aquí he utilizado el parámetro importance = “permutation”

# Definir una función para calcular la Accuracy balanceada
balanced_accuracy <- function(data, lev = NULL, model = NULL) {
  cm <- confusionMatrix(data$pred, data$obs)
  sensitivity <- cm$byClass["Sensitivity"]
  specificity <- cm$byClass["Specificity"]
  balanced_acc <- mean(c(sensitivity, specificity))
  c(BalancedAccuracy = balanced_acc)
}

# Definir las métricas de evaluación que queremos utilizar
RFControl <- trainControl(
  method = "cv", 
  number = 10,
  repeats = 10,
  summaryFunction = balanced_accuracy,
  allowParallel = TRUE,
  seeds = NULL, 
  returnResamp = "final",
)

set.seed(1234)
sqr.genes <- round(sqrt(ncol(mat_exp_alz_genes.RF)))

mygrid <- expand.grid(mtry = c(sqr.genes - 2, sqr.genes - 1, sqr.genes, sqr.genes+1, sqr.genes+2),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1)
trees <- seq(500, 2000, 500) 

rf.cv.10 <- list() # lista vacia para meter los resultados de cada bucle
for (tree in trees){ # bucle en cada pasada hace un modelo con un número de arboles distintos
  modelo <- train(class ~., data=mat_exp_alz_genes.RF,
                  method="ranger",
                  tuneGrid=mygrid,
                  trControl=RFControl, 
                  ntree = tree,
                  metric = "BalancedAccuracy",
                  importance = "permutation" # o impurity permutation https://arxiv.org/abs/1407.7502
  )
  rf.cv.10[[paste(tree, "trees")]] <- modelo # metemos el modelo en la lista
}
# Hacemos dataframe con las métricas de cada modelo
datos_combinados <- rbind(
  data.frame(mtry = rf.cv.10$`500 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`500 trees`$results$BalancedAccuracy, Trees = "500 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`1000 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`1000 trees`$results$BalancedAccuracy, Trees = "1000 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`1500 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`1500 trees`$results$BalancedAccuracy, Trees = "1500 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`2000 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`2000 trees`$results$BalancedAccuracy, Trees = "2000 trees", Metric = "Balanced Accuracy")
)

datos_combinados$Trees <- factor(datos_combinados$Trees, levels = c("500 trees", "1000 trees", "1500 trees", "2000 trees"))

datos_combinados %>%
  # filter(splitrule == "extratrees") %>%
  filter(Metric == "Balanced Accuracy") %>%
  ggplot(aes(x = mtry, y = MetricValue, color = Trees)) +
  geom_point(size = 4) +
  geom_line(linetype = "dashed") +
  theme_minimal() +
  labs(x = "Mtry", y = "Valor de la Métrica", title = "Comparación de Balanced Accuracy con splitrule") +
  facet_wrap(~ splitrule) 

models <- resamples(rf.cv.10)

summary(models)
## 
## Call:
## summary.resamples(object = models)
## 
## Models: 500 trees, 1000 trees, 1500 trees, 2000 trees 
## Number of resamples: 10 
## 
## BalancedAccuracy 
##                 Min. 1st Qu. Median      Mean   3rd Qu.     Max. NA's
## 500 trees  0.4848485     0.5    0.5 0.5734848 0.6875000 0.750000    0
## 1000 trees 0.5000000     0.5    0.5 0.5734375 0.5000000 0.984375    0
## 1500 trees 0.4843750     0.5    0.5 0.5734375 0.6875000 0.750000    0
## 2000 trees 0.5000000     0.5    0.5 0.5984375 0.6757812 1.000000    0
important.genes.umap2pos <- varImp(rf.cv.10$`2000 trees`)$importance %>% 
  rownames_to_column() %>%
  arrange(desc(Overall))

important.genes.umap2pos %>%
  head(50) %>%
  mutate(row_index = row_number(),  # Crear una columna numérica para el índice
         rowname = forcats::fct_inorder(rowname)) %>%
  ggplot(aes(x = row_index, y = Overall)) +  # Usar row_index en el eje X
    geom_point() +
    geom_line() +
    labs(x = "Número Genes", title = "Importancia genes UMAP1- en RF") +
    theme_bw() +
    theme(axis.text.x = element_text(size = 10))

7.3.1.4 mUMAP2-

# Crear un vector con todos los individuos etiquetados como "Sano"
estado <- rep(0, nrow(mat_exp_alz_genes))

# Reemplazar "Sano" con "Enfermo" para los individuos en la lista de enfermos
estado[rownames(mat_exp_alz_genes) %in% mUMAP2.neg] <- 1

# Añadir la nueva columna a la matriz de expresión de genes
mat_exp_alz_genes.RF <- data.frame(cbind(mat_exp_alz_genes, class = estado))

mat_exp_alz_genes.RF$class <- as.factor(mat_exp_alz_genes.RF$class)

Aquí he utilizado el parámetro importance = “permutation”

# Definir una función para calcular la Accuracy balanceada
balanced_accuracy <- function(data, lev = NULL, model = NULL) {
  cm <- confusionMatrix(data$pred, data$obs)
  sensitivity <- cm$byClass["Sensitivity"]
  specificity <- cm$byClass["Specificity"]
  balanced_acc <- mean(c(sensitivity, specificity))
  c(BalancedAccuracy = balanced_acc)
}

# Definir las métricas de evaluación que queremos utilizar
RFControl <- trainControl(
  method = "cv", 
  number = 10,
  repeats = 10,
  summaryFunction = balanced_accuracy,
  allowParallel = TRUE,
  seeds = NULL, 
  returnResamp = "final",
)

set.seed(1234)
sqr.genes <- round(sqrt(ncol(mat_exp_alz_genes.RF)))

mygrid <- expand.grid(mtry = c(sqr.genes - 2, sqr.genes - 1, sqr.genes, sqr.genes+1, sqr.genes+2),
                      splitrule = c("gini", "extratrees"),
                      min.node.size = 1)
trees <- seq(500, 2000, 500) 

rf.cv.10 <- list() # lista vacia para meter los resultados de cada bucle
for (tree in trees){ # bucle en cada pasada hace un modelo con un número de arboles distintos
  modelo <- train(class ~., data=mat_exp_alz_genes.RF,
                  method="ranger",
                  tuneGrid=mygrid,
                  trControl=RFControl, 
                  ntree = tree,
                  metric = "BalancedAccuracy",
                  importance = "permutation" # o impurity permutation https://arxiv.org/abs/1407.7502
  )
  rf.cv.10[[paste(tree, "trees")]] <- modelo # metemos el modelo en la lista
}
# Hacemos dataframe con las métricas de cada modelo
datos_combinados <- rbind(
  data.frame(mtry = rf.cv.10$`500 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`500 trees`$results$BalancedAccuracy, Trees = "500 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`1000 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`1000 trees`$results$BalancedAccuracy, Trees = "1000 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`1500 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`1500 trees`$results$BalancedAccuracy, Trees = "1500 trees", Metric = "Balanced Accuracy"),
  data.frame(mtry = rf.cv.10$`2000 trees`$results$mtry, splitrule = rf.cv.10$`500 trees`$results$splitrule, MetricValue = rf.cv.10$`2000 trees`$results$BalancedAccuracy, Trees = "2000 trees", Metric = "Balanced Accuracy")
)

datos_combinados$Trees <- factor(datos_combinados$Trees, levels = c("500 trees", "1000 trees", "1500 trees", "2000 trees"))

datos_combinados %>%
  # filter(splitrule == "extratrees") %>%
  filter(Metric == "Balanced Accuracy") %>%
  ggplot(aes(x = mtry, y = MetricValue, color = Trees)) +
  geom_point(size = 4) +
  geom_line(linetype = "dashed") +
  theme_minimal() +
  labs(x = "Mtry", y = "Valor de la Métrica", title = "Comparación de Balanced Accuracy con splitrule") +
  facet_wrap(~ splitrule) 

models <- resamples(rf.cv.10)

summary(models)
## 
## Call:
## summary.resamples(object = models)
## 
## Models: 500 trees, 1000 trees, 1500 trees, 2000 trees 
## Number of resamples: 10 
## 
## BalancedAccuracy 
##                 Min.   1st Qu.    Median      Mean 3rd Qu. Max. NA's
## 500 trees  0.6666667 0.8710938 1.0000000 0.9234375       1    1    0
## 1000 trees 0.7500000 0.7708333 1.0000000 0.9083333       1    1    0
## 1500 trees 0.7500000 0.7500000 0.9921875 0.8984375       1    1    0
## 2000 trees 0.7500000 0.8333333 0.9921875 0.9151042       1    1    0
important.genes.umap2neg <- varImp(rf.cv.10$`500 trees`)$importance %>% 
  rownames_to_column() %>%
  arrange(desc(Overall))

important.genes.umap2neg %>%
  head(50) %>%
  mutate(row_index = row_number(),  # Crear una columna numérica para el índice
         rowname = forcats::fct_inorder(rowname)) %>%
  ggplot(aes(x = row_index, y = Overall)) +  # Usar row_index en el eje X
    geom_point() +
    geom_line() +
    labs(x = "Número Genes", title = "Importancia genes UMAP1- en RF") +
    theme_bw() +
    theme(axis.text.x = element_text(size = 10))

### Enriquecimiento

7.3.1.5 GO

important.genes.top <- important.genes.umap1pos%>%
  head(., 40) 

matriz.AD.minus.important.genes <- mat_exp_alz_genes[!colnames(mat_exp_alz_genes) %in% important.genes.top$rowname, ]

top_genes <- important.genes.top$rowname

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "GO", 
                     multi_query = T,
                     highlight = T)

gostplot(gp_up_ordered, interactive = T, capped = F)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "GO", 
          highlight = T)

multi_gp$result <- multi_gp$result %>%
  filter(highlighted == T)

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(Cluster)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 20, x = "Cluster", size = "GeneRatio")

7.3.1.6 KEGG

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "KEGG", 
                     multi_query = T,
                     highlight = T)

gostplot(gp_up_ordered, interactive = T, capped = T)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "KEGG")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(generationumerico)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 30, x = "Cluster", size = "GeneRatio")

7.3.1.7 REAC

gp_up_ordered = gost(list("Specific UMAP1+ genes" = important.genes.top$rowname, 
                          "Matriz genes" = colnames(matriz.AD.minus.important.genes)) , 
                     organism = "hsapiens",
                     sources = "REAC", 
                     multi_query = T,
                     highlight = T)

gostplot(gp_up_ordered, interactive = T, capped = F)
top_genes_umap1pos.gprofiler = gconvert(important.genes.top$rowname)
top_genes_matriz.gprofiler = gconvert(colnames(matriz.AD.minus.important.genes))

# enrichment analysis using gene names
multi_gp = gost(list("Specific UMAP1+ genes" = top_genes_umap1pos.gprofiler$name,
                     "Matriz genes" = top_genes_matriz.gprofiler$name), 
          multi_query = FALSE, 
          evcodes = TRUE, 
          sources = "REAC")

# modify the g:Profiler data frame
gp_mod = multi_gp$result[,c("query", "source", "term_id",
                                "term_name", "p_value", "query_size", 
                                "intersection_size", "term_size", 
                                "effective_domain_size", "intersection")]
gp_mod$GeneRatio = paste0(gp_mod$intersection_size,  "/", gp_mod$query_size)
gp_mod$BgRatio = paste0(gp_mod$term_size, "/", gp_mod$effective_domain_size)
names(gp_mod) = c("Cluster", "Category", "ID", "Description", "p.adjust", 
                    "query_size", "Count", "term_size", "effective_domain_size", 
                    "geneID", "GeneRatio", "BgRatio")
gp_mod$geneID = gsub(",", "/", gp_mod$geneID)

# define as enrichResult object
gp_mod_enrich  = new("enrichResult", result = gp_mod)

gp_mod_enrich@result <- gp_mod_enrich@result %>%
  mutate(generationumerico = Count / query_size) %>%
  arrange(desc(Cluster)) 

enrichplot::dotplot(gp_mod_enrich, showCategory = 20, x = "Cluster", size = "GeneRatio")

8 Comparaciones

8.1 Casos

8.1.1 Muestras entre técnicas

outliers.PCA <- c(rownames(outlierspc1), rownames(outlierspc2))
outliers.PCA <- outliers.PCA[outliers.PCA %in% rownames(covs2.casos)]

outliers.tSNE <- c(rownames(outliers.tsne1), rownames(outliers.tsne2))
outliers.tSNE <- outliers.tSNE[outliers.tSNE %in% rownames(covs2.casos)]

outliers.UMAP <- c(rownames(outliers.umap1), rownames(outliers.umap2))
outliers.UMAP <- outliers.UMAP[outliers.UMAP %in% rownames(covs2.casos)]

all.outliers <- list(PCA.KEGG = outliers.PCA,
                     tSNE.KEGG = outliers.tSNE,
                     UMAP.KEGG = outliers.UMAP)

max_length <- max(sapply(all.outliers, length))

# Función para normalizar las longitudes de las listas
normalize_length <- function(x, max_length) {
  length(x) <- max_length  # Esto rellenará con NA si la lista es más corta que max_length
  x
}

# Aplicar la normalización a cada lista en 'all.outliers'
all.outliers <- lapply(all.outliers, normalize_length, max_length = max_length)

# Convertir la lista normalizada en un data.frame
all.outliers <- data.frame(all.outliers)

8.1.2 Diagrama de Venn

lista.outliers <- lapply(all.outliers, function(x) unique(na.omit(x)))
# Generar el diagrama de Venn
venn.plot <- venn.diagram(
  x = lista.outliers[1:3],
  category.names = c("PCA", "tSNE", "UMAP"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#21908dff', '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(0, 0, 0), #posición de las categorías
  cat.dist = 0.05, #distancia de las categorías
  rotation.degree = 0, 
  margin = 0.05, # hacerla más pequeña
  lwd = 0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn KEGG geneset"),
  main.fontface = "bold", 
  main.cex = 2,
  main.pos = c(0.5, 1)
)

# Calcular la intersección
interseccion_ABC <- Reduce(intersect, lista.outliers[1:3])

# Dibuja el diagrama de Venn y el texto de la intersección
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)

# interseccion_AB <- Reduce(intersect, lista.outliers[1:2])
# grid.text(label = paste(gsub("_.*","",interseccion_AB[!(interseccion_AB %in% interseccion_ABC)]), collapse = "\n"), 
#           x = 0.68, 
#           y = 0.33, 
#           gp = gpar(fontsize = 8, 
#                     col = "black",
#                     fontface = "italic"))
# 
# interseccion_AC <- Reduce(intersect, lista.outliers[c(1,3)])
# grid.text(label = paste(gsub("_.*","",interseccion_AC[!(interseccion_AC %in% interseccion_ABC)]), collapse = "\n"), 
#           x = 0.365, 
#           y = 0.25, 
#           gp = gpar(fontsize = 8, 
#                     col = "black",
#                     fontface = "italic"))

8.1.3 UpSetR

upset(fromList(lista.outliers), 
      order.by = "freq",
      sets.x.label = "Patients by method",
      mainbar.y.label = "Patient intersections",
      point.size = 3.5, line.size = 1.5,
      text.scale = c(1.3, 1.3, 1, 1, 1.5, 1.3),
      empty.intersections = "on",
      queries = list(list(query = intersects, params = list("PCA.KEGG", 
    "tSNE.KEGG", "UMAP.KEGG"), color = "orange", active = T)),
      )
# Agregar el título usando grid.text
grid.text("Casos", x = 0.1, y = 0.95, gp = gpar(fontsize = 20, fontface = "bold"))

reactable(covs2.casos %>%
  filter(sampleset_PCA != "Not in both" & sampleset_tSNE != "Not in both" & sampleset_UMAP != "Not in both")
)

8.2 Casos y sanos

8.2.1 Muestras entre técnicas

outliers.PCA <- c(rownames(outlierspc1), rownames(outlierspc2))

outliers.tSNE <- c(rownames(outliers.tsne1), rownames(outliers.tsne2))

outliers.UMAP <- c(rownames(outliers.umap1), rownames(outliers.umap2))

all.outliers <- list(PCA.KEGG = outliers.PCA,
                     tSNE.KEGG = outliers.tSNE,
                     UMAP.KEGG = outliers.UMAP)

max_length <- max(sapply(all.outliers, length))

# Función para normalizar las longitudes de las listas
normalize_length <- function(x, max_length) {
  length(x) <- max_length  # Esto rellenará con NA si la lista es más corta que max_length
  x
}

# Aplicar la normalización a cada lista en 'all.outliers'
all.outliers <- lapply(all.outliers, normalize_length, max_length = max_length)

# Convertir la lista normalizada en un data.frame
all.outliers <- data.frame(all.outliers)

8.2.2 Diagrama de Venn

lista.outliers <- lapply(all.outliers, function(x) unique(na.omit(x)))
# Generar el diagrama de Venn
venn.plot <- venn.diagram(
  x = lista.outliers[1:3],
  category.names = c("PCA", "tSNE", "UMAP"),
  filename = NULL,
  output = FALSE,  
  fill = c("#440154ff", '#21908dff', '#fde725ff'),
  cex = 1,  # Aumenta el tamaño del texto
  fontface = "bold",
  cat.cex = 1,  # Aumenta el tamaño del texto de las categorías
  cat.fontface = "bold",
  cat.default.pos = "text",
  cat.pos = c(0, 0, 0), #posición de las categorías
  cat.dist = 0.05, #distancia de las categorías
  rotation.degree = 0, 
  margin = 0.05, # hacerla más pequeña
  lwd = 0.5,
  lty = "dashed", # Estilo de línea discontinua
  edge.col = "grey", # Color de los bordes
  main = paste0("Diagrama Venn KEGG geneset"),
  main.fontface = "bold", 
  main.cex = 2,
  main.pos = c(0.5, 1)
)

# Calcular la intersección
interseccion_ABC <- Reduce(intersect, lista.outliers[1:3])

# Dibuja el diagrama de Venn y el texto de la intersección
grid.newpage()  # Asegura que el lienzo esté limpio
grid.draw(venn.plot)

# interseccion_AB <- Reduce(intersect, lista.outliers[1:2])
# grid.text(label = paste(gsub("_.*","",interseccion_AB[!(interseccion_AB %in% interseccion_ABC)]), collapse = "\n"), 
#           x = 0.68, 
#           y = 0.33, 
#           gp = gpar(fontsize = 8, 
#                     col = "black",
#                     fontface = "italic"))
# 
# interseccion_AC <- Reduce(intersect, lista.outliers[c(1,3)])
# grid.text(label = paste(gsub("_.*","",interseccion_AC[!(interseccion_AC %in% interseccion_ABC)]), collapse = "\n"), 
#           x = 0.365, 
#           y = 0.25, 
#           gp = gpar(fontsize = 8, 
#                     col = "black",
#                     fontface = "italic"))

8.2.3 UpSetR

upset(fromList(lista.outliers), 
      order.by = "freq",
      sets.x.label = "Patients by method",
      mainbar.y.label = "Patient intersections",
      point.size = 3.5, line.size = 1.5,
      text.scale = c(1.3, 1.3, 1, 1, 1.5, 1.3),
      empty.intersections = "on",
      queries = list(list(query = intersects, params = list("PCA.KEGG", 
    "tSNE.KEGG", "UMAP.KEGG"), color = "orange", active = T)),
      )
# Agregar el título usando grid.text
grid.text("Casos y Sanos", x = 0.2, y = 0.95, gp = gpar(fontsize = 20, fontface = "bold"))

reactable(covs2 %>%
  filter(sampleset_PCA != "Not in both" & sampleset_tSNE != "Not in both" & sampleset_UMAP != "Not in both")
)